* Start from the master panel; everything below operates on the data in memory.
use "$temp/psid_master", clear

*********Various sample restrictions**********
keep if inrange(year, 2000, 2018) //limit year range
keep if sex == 2 //women
keep if inrange(age, 15, 40) //wide age window, used only while backfilling birth state

* state_raised replaces state_born; codes 0 and 99 are treated as "not available"
drop state_born
ren state_raised state_born
replace state_born = . if inlist(state_born, 0, 99)

//define birth state as state of first observation if otherwise unavailable
sort uniqid year
* first row of each person: fall back on the state observed in that row
replace state_born = state if state_born == . & uniqid != uniqid[_n-1]
* later rows: carry the previous row's value forward within the same person
replace state_born = state_born[_n-1] if state_born == . & state_born[_n-1] != . & uniqid == uniqid[_n-1]
keep if inrange(age, 22, 40) //final age range

//household heads and spouses
keep if inlist(relate, 1, 2)

//requirement: everybody has to be observed at age 22 to stay in sample
bys uniqid: egen seen_at_22 = max(age == 22)
keep if seen_at_22
drop seen_at_22

//only keep people who never skip a year
* gap between consecutive rows of the same person (0 on a person's first row);
* relies on the data still being ordered by uniqid year
gen yr_gap = cond(uniqid == uniqid[_n-1], year - year[_n-1], 0)
bys uniqid: egen yr_gap_max = max(yr_gap)
drop if yr_gap_max >= 2 //limitation to yearly data! (original note: over 6k observations)
drop yr_gap yr_gap_max //(original note: about 770 women left at this point)

//fill in a tiny number (like 3) of missing state observations
* first from the next row, then from the previous row, of the same person
replace state = state[_n+1] if state == . & state[_n+1] != . & uniqid == uniqid[_n+1]
replace state = state[_n-1] if state == . & state[_n-1] != . & uniqid == uniqid[_n-1]

************fill in parent location information************
//prepare mother/father data
* One lookup file per parent: (uniqid_<parent>, year) -> state_<parent>,
* taken from the master panel so a parent's own state in a year can be merged on.
foreach parent in mother father {
	preserve
	use "$temp/psid_master", clear
	keep state uniqid year
	ren uniqid uniqid_`parent'
	ren state state_`parent'
	tempfile loc_`parent'
	save `loc_`parent''
	restore
}

* Mother's last non-missing state, from the location/mobility file (one row per mother)
preserve
use "$temp/psid_location_mobility", clear
keep state uniqid year
drop if state == .
sort uniqid year
keep if uniqid!=uniqid[_n+1] //last year of observation
drop year
ren uniqid uniqid_mother
ren state state_mother_last
tempfile loc_mother_last
save `loc_mother_last'
restore

//merge and generate parent state variable
* keep(1 3): unmatched women stay in the sample with missing parent states
merge m:1 uniqid_mother year using `loc_mother', keep(1 3) nogen
merge m:1 uniqid_father year using `loc_father', keep(1 3) nogen
merge m:1 uniqid_mother  using `loc_mother_last', keep(1 3) nogen

* Parents agree -> that state; only one parent observed -> that parent's state;
* parents observed in different states -> left missing
gen state_parent = cond(state_mother == state_father, state_mother, cond(state_father == ., state_mother, cond(state_mother == ., state_father, .)))

//(original note: this gets parent location for around 2/3 the sample.) If not observed in a year, assume they didn't move
sort uniqid year
* carry forward, then backward, within the same woman
replace state_parent = state_parent[_n-1] if state_parent == . & state_parent[_n-1] != . & uniqid == uniqid[_n-1]
replace state_parent = state_parent[_n+1] if state_parent == . & state_parent[_n+1] != . & uniqid == uniqid[_n+1]
* still missing: fall back on the mother's last observed state
replace state_parent = state_mother_last if state_parent == .

* diagnostics: how many rows have a parent state, and how often it equals the birth state
count if state_parent!=.
count if state_parent!=. & state_parent == state_born

replace state_parent = state_born if state_parent == . //last-ditch: use state of birth as a proxy
drop state_mother state_father uniqid_mother uniqid_father state_mother_last

sort uniqid year

//let's see how much we can detect in terms of migration (state level)
* Pure diagnostics: everything created here is discarded by the restore at the end.
preserve
sort uniqid year
gen uniqid_next = uniqid[_n+1]
* next-row values; on a person's last row the current value is reused
gen state_next = cond(uniqid != uniqid_next, state, state[_n+1])
gen state_parent_next = cond(uniqid != uniqid_next, state_parent, state_parent[_n+1])

* reusable conditions
local same_person "uniqid == uniqid_next"
* woman stays put while the parent state changes to the woman's state
local parent_arrives "state == state_next & uniqid == uniqid_next & state_parent_next!=state_parent & state_parent_next == state"

count if state!=state_next & `same_person'
count if state!=state_next & `same_person' & state_next == state_parent //# of home moves
count if `parent_arrives'
count if `same_person' & state_parent_next!=state_parent



//Age of Youngest Child
forval k = 1/18{
	gen age_child_`k' = year - birth_year_`k'
	replace age_child_`k' = . if age_child_`k'<0 //child not born yet
}
egen ac = rowmin(age_child*)
replace ac = -1 if ac == . | ac>4 //reset to null state if no young children
drop age_child*

//number of children born by the current year
gen nchild = 0
forval k = 1/18{
	replace nchild = nchild + (year - birth_year_`k'>=0 & birth_year_`k'!=.)
}

//Fertility Status: 1 when the same woman's next row has a child of age 0
gen f = (ac[_n+1] == 0 & uniqid == uniqid[_n+1])


tab ac if `parent_arrives'
tab f if `parent_arrives'




restore


***************fill in spouse information if married************
* Build a husband file keyed on (year, fam) from male household heads in the master panel.
preserve
use "$temp/psid_master", clear
keep if inrange(year, 2000, 2018) //limit year range
keep if sex == 1 & relate == 1 //men who are household heads
keep year fam wage hours uniqid educ

//spousal education: college dummy from the highest education value seen for the man
bys uniqid: egen max_educ = max(educ)
gen e_sp = (max_educ>=16)
drop max_educ educ

//a bit of data cleaning
* within duplicated year-fam cells, first discard rows without a wage ...
bys year fam: gen dup = _N - 1
drop if wage == . & dup
drop dup

* ... then discard every cell that is still duplicated
bys year fam: drop if _N > 1

ren (wage hours uniqid) (wage_hus hours_hus uniqid_hus) //uniqid_hus: identifier used for the husband individual FE
tempfile husbands
save `husbands'
restore

//add on husband information for married women
preserve
keep if relate == 2 //keep spouses
merge 1:1 year fam using `husbands', keep(1 3) nogen
tempfile wives_filled
save `wives_filled'
restore

//swap out data for women who have spouses
drop if relate == 2
append using `wives_filled'
sort uniqid year

//hours correction
* missing own hours are treated as zero; husband hours only when a husband record was merged on
replace hours = 0 if hours == .
replace hours_hus = 0 if hours_hus == . & uniqid_hus!=. & relate == 2

//wage corrections for wife and husband
* Wife: for rows with positive hours but no wage, (1) average the neighbouring years,
* (2) carry the previous year forward, (3) carry the next year back, (4) use the sample mean.
* Data are sorted by uniqid year, so uniqid[_n-1] == uniqid[_n+1] implies the current row
* belongs to the same woman.
replace wage = (wage[_n+1] + wage[_n-1]) / 2 if wage[_n] == . & wage[_n-1]!=. & wage[_n+1] !=. & hours[_n]>0 & uniqid[_n-1] == uniqid[_n+1]
replace wage = wage[_n-1] if wage[_n] == . & wage[_n-1]!=. & hours[_n]>0 & uniqid[_n] == uniqid[_n-1]
replace wage = wage[_n+1] if wage[_n] == . & wage[_n+1]!=. & hours[_n]>0 & uniqid[_n] == uniqid[_n+1]
su wage
replace wage = `r(mean)' if wage == . & hours>0

//husband correction
* Same four steps for the husband. hours_hus!=. is required because a missing value
* satisfies hours_hus>0 in Stata.
* The data are ordered by the WIFE's id, so equal husband ids on rows _n-1 and _n+1 do not
* guarantee that the current row has the same husband; the interpolation therefore also
* requires the current row's uniqid_hus to match both neighbours.
replace wage_hus = (wage_hus[_n+1] + wage_hus[_n-1]) / 2 if wage_hus[_n] == . & wage_hus[_n-1]!=. & wage_hus[_n+1] !=. & hours_hus[_n]>0 & uniqid_hus[_n] == uniqid_hus[_n-1] & uniqid_hus[_n] == uniqid_hus[_n+1] & hours_hus!=.
replace wage_hus = wage_hus[_n-1] if wage_hus[_n] == . & wage_hus[_n-1]!=. & hours_hus[_n]>0 & uniqid_hus[_n] == uniqid_hus[_n-1] & hours_hus!=.
replace wage_hus = wage_hus[_n+1] if wage_hus[_n] == . & wage_hus[_n+1]!=. & hours_hus[_n]>0 & uniqid_hus[_n] == uniqid_hus[_n+1] & hours_hus!=.
su wage_hus
replace wage_hus = `r(mean)' if wage_hus == . & hours_hus>0 & hours_hus!=.

//fix up outliers
foreach var of varlist wage wage_hus hours hours_hus {
	su `var', d
	replace `var' = `r(p99)' if `var'>`r(p99)' & `var'!=. //cap things at 99th percentile
}

****************Creation of Other Variables****************
//Education: college dummy from the highest education value observed for the woman
bys uniqid: egen max_educ = max(educ)
gen e = (max_educ>=16)
drop max_educ educ

//Last-Period LFP
replace hours = hours/52 //annual hours -> per-week hours
* worked more than 30 in the same woman's previous row; assume p = 1 at age 22 for ease
gen p = (hours[_n-1]>30 & uniqid[_n-1] == uniqid) | age == 22

//current hours selection
gen h = (hours>30)

//Dummy Indicating Parent being in home location
gen lph = (state_born == state_parent)

//Experience
* age-22 starting value: 0 for college, 4 for non-college (assumed 4 years of work by 22)
gen x = cond(e, 0, 4) if age == 22
* afterwards accumulate last row's work indicator within the same woman
replace x = x[_n-1] + h[_n-1] if uniqid == uniqid[_n-1]

//Age of Youngest Child
forval k = 1/18{
	gen age_child_`k' = year - birth_year_`k'
	replace age_child_`k' = . if age_child_`k'<0 //child not born yet
}
egen ac = rowmin(age_child*)
replace ac = -1 if ac == . | ac>4 //reset to null state if no young children
drop age_child*

//number of children born by the current year
gen nchild = 0
forval k = 1/18{
	replace nchild = nchild + (year - birth_year_`k'>=0 & birth_year_`k'!=.)
}

//Fertility Status: 1 when the same woman's next row has a child of age 0
gen f = (ac[_n+1] == 0 & uniqid == uniqid[_n+1])

* Attach the division for the woman's state (-> div) and the parent state (-> div_parent).
* keep(match) drops rows whose state is not in the crosswalk; code 11 is dropped afterwards.
* Note: merge does not preserve the uniqid/year row order.
foreach loc in state state_parent {
	local divname = cond("`loc'" == "state", "div", "div_parent")
	ren `loc' stfips
	merge m:1 stfips using "$data/Crosswalks/state_div_crosswalk", keep(match) nogen
	ren division `divname'
	ren stfips `loc'
	drop if `loc' == 11
}



//let's see how much we can detect in terms of migration (division level)
* Pure diagnostics: everything created here is discarded by the restore at the end.
preserve
sort uniqid year
gen uniqid_next = uniqid[_n+1]
* next-row values; on a person's last row the current value is reused
gen div_next = cond(uniqid != uniqid_next, div, div[_n+1])
gen div_parent_next = cond(uniqid != uniqid_next, div_parent, div_parent[_n+1])

* reusable conditions
local same_person "uniqid == uniqid_next"
* woman stays put while the parent division changes to the woman's division
local parent_arrives "div==div_next & uniqid == uniqid_next  & div_parent_next!=div_parent & div_parent_next == div"

count if div!=div_next & `same_person'
count if div!=div_next & `same_person' & div_next == div_parent //# of home moves
count if `parent_arrives'
count if `same_person' & div_parent_next!=div_parent



tab ac if `parent_arrives'
tab f if `parent_arrives'




restore


/*
//Location Child Care Costs -- Starting Location
ren state statefips
merge m:1 statefips using "$temp/state_ccc", keep(match) nogen
gen d_l = 0
replace d_l = 1 if ccc_quant == 2
drop ccc_quant ccc 
ren statefips state

//Location Child Care Costs -- Parent
ren state_parent statefips
merge m:1 statefips using "$temp/state_ccc", keep(match) nogen
gen d_lp = 0
replace d_lp = 1 if ccc_quant == 2
drop ccc_quant ccc 
ren statefips state_parent

//Location Child Care Costs -- Home
ren state_born statefips
merge m:1 statefips using "$temp/state_ccc", keep(match) nogen
gen d_lh = 0
replace d_lh = 1 if ccc_quant == 2
drop ccc_quant ccc 
ren statefips state_born
*/

* restore person-year order before using [_n+1]
sort uniqid year

//Coding of migration variable
* next-row state of the same woman; missing on her last row
gen state_prime = state[_n+1] if uniqid == uniqid[_n+1]
* NOTE(review): 35 is labelled the terminal age, but the sample keeps ages 22-40 -- confirm
replace state_prime = state if age == 35 & state_prime == . //terminal age; assume no migration
* mig: 0 = stays, 1 = moves, missing when the next state is unknown
gen mig = cond(state == state_prime, 0, cond(state_prime != ., 1, .))
tab mig

/*
//Model-version location variables
//current location
gen l = 0 //baseline: parent location
replace l = 1 if state == state_born & state!=state_parent //home
replace l = 2 if state != state_born & state!=state_parent //other

//next-location choice
gen l_prime = -1 //undetermined
replace l_prime = 1 if state == state_prime //staying option
replace l_prime = 2 if state!=state_prime & state_prime == state_parent //parent move
replace l_prime = 3 if state!=state_prime & state_prime == state_born & state_born!=state_parent //home move
replace l_prime = 4 if state!=state_prime & state_prime != state_born & state_prime != state_parent & state_prime!=. & d_l[_n+1] == 0 //other, low
replace l_prime = 5 if state!=state_prime & state_prime != state_born & state_prime != state_parent & state_prime!=. & d_l[_n+1] == 1 //other, high
*/

//observation count here: 9109

*************Woman and spouse wage fixed effects*************
* mu: woman's wage fixed-effect type (1 = low, 2 = high)
* m : 0 = no husband record, 1 = low husband FE type, 2 = high husband FE type
gen mu = 0
gen m = 0
gen x2 = x^2
gen age2 = age^2
gen lwage = log(wage) //missing when wage is missing or zero
gen lwage_hus = log(wage_hus)

*************women*************
//obtain fixed effects, separately for college (e) and non-college women
xtset uniqid
xtreg lwage i.year x x2 if e, cluster(uniqid) fe
predict mu_coll, u

xtreg lwage i.year x x2 if !e, cluster(uniqid) fe
predict mu_hs, u

//discretize
gen temp = mu_coll
replace temp = mu_hs if temp ==. & mu_hs!=.
xtile temp_quant = temp, nq(2)
* predict, u is filled only for rows in the estimation sample, so a woman's type would be
* missing (and then forced to 1) in every year she has no wage. The FE is a person-level
* quantity, so spread her quantile to all of her rows before assigning mu.
bys uniqid: egen temp_quant_id = max(temp_quant)
replace mu = temp_quant_id
replace mu = 1 if mu == . //assume low type if a wage is never observed

//store discretized values and see how they perform
gen mu_val = .
su temp if temp_quant == 1
replace mu_val = `r(mean)' if mu == 1

su temp if temp_quant == 2
replace mu_val = `r(mean)' if mu == 2
reg lwage e x x2 mu_val, robust
drop temp temp_quant temp_quant_id mu_coll mu_hs //clean up

*************men*************
xtset, clear
xtset uniqid_hus

//run regressions
* NOTE(review): the sample split uses the wife's college dummy e and the controls are the
* wife's age; e_sp holds the spousal education dummy -- confirm this is intended.
xtreg lwage_hus i.year age age2 if e, cluster(uniqid_hus) fe
predict mu_coll, u

xtreg lwage_hus i.year age age2 if !e, cluster(uniqid_hus) fe
predict mu_hs, u

//discretize
gen temp = mu_coll
replace temp = mu_hs if temp ==. & mu_hs!=.
xtile temp_quant = temp, nq(2)
* as for women: spread the husband's quantile to all of his rows (rows without a husband
* record are skipped and keep m = 0)
bys uniqid_hus: egen temp_quant_id = max(temp_quant) if uniqid_hus != .
replace m = temp_quant_id if temp_quant_id !=. //update m to be either 1 or 2 (low or high FE)
* m starts at 0, never missing, so the fallback has to test m == 0: a husband record
* exists but no FE could be estimated for him
replace m = 1 if m == 0 & uniqid_hus!=. //assume low type if never observed to work

//store discretized values and see how they perform
gen m_val = .
su temp if temp_quant == 1
replace m_val = `r(mean)' if m == 1
su temp if temp_quant == 2
replace m_val = `r(mean)' if m == 2

reg lwage_hus e age age2 m_val, robust //regression!
drop temp temp_quant temp_quant_id mu_coll mu_hs //clean up

* the by-group steps above reorder the data; put it back in person-year order
sort uniqid year

drop sex birth_year*
order uniqid year fam race wage hours wage_hus hours_hus state state_born state_parent mu e m p age x ac f e_sp




/*
//4/21/2021 tweaks: add on wage effects, delete variables related to home state
ren state statefip
merge m:1 statefip using "$temp/state_wage_types", keep(match) nogen
ren coef_quant nu_state
ren statefip state
ren state_parent statefip
merge m:1 statefip using "$temp/state_wage_types", keep(match) nogen
ren coef_quant nu_state_parent
ren statefip state_parent

drop state_born lph d_lh 
replace d_l = d_l + 1
ren d_l ccc
replace d_lp = d_lp + 1
ren d_lp ccc_p
*/




* Current location index: 1 = living in the parent's state, otherwise own division + 1
gen l = cond(state == state_parent, 1, div + 1)

* Parent location type is the parent's division
gen lp_type = div_parent


/*
//recode location
replace l = .
replace l = 1 if state == state_parent
replace l = 2 if state!=state_parent & (nu_state == 1 & ccc == 1)
replace l = 3 if state!=state_parent & (nu_state == 2 & ccc == 1)
replace l = 4 if state!=state_parent & (nu_state == 1 & ccc == 2)
replace l = 5 if state!=state_parent & (nu_state == 2 & ccc == 2)

//parent location type
gen lp_type = .
replace lp_type = 1 if (nu_state_p == 1 & ccc_p == 1)
replace lp_type = 2 if (nu_state_p == 2 & ccc_p == 1)
replace lp_type = 3 if (nu_state_p == 1 & ccc_p == 2)
replace lp_type = 4 if (nu_state_p == 2 & ccc_p == 2)

//next location type
ren state_prime statefip
merge m:1 statefip using "$temp/state_wage_types", keep(1 3) nogen
ren coef_quant nu_state_prime
ren statefip statefips
merge m:1 statefips using "$temp/state_ccc", keep(1 3) nogen
ren ccc_quant ccc_prime
ren statefips state_prime
*/

* Attach the division of next period's state; keep(1 3) keeps rows whose state_prime is
* missing or not in the crosswalk (div_prime is then missing).
ren state_prime stfips
merge m:1 stfips using "$data/Crosswalks/state_div_crosswalk",  keep(1 3) nogen
ren division div_prime
ren stfips state_prime

* Next-period location choice:
*   11            stay in the current state
*   1             move to the parent's state
*   div_prime + 1 move to any other observed state
*   99            none of the above (next state not observed)
gen l_prime = cond(state == state_prime, 11, cond(state_prime == state_parent & state != state_parent, 1, cond(state_prime != state_parent & state_prime != ., div_prime + 1, 99)))


/*
replace l_prime = 99 //not observed; don't consider in likelihood
replace l_prime = 6 if state == state_prime //staying
replace l_prime = 1 if state_prime == state_parent & state !=state_parent
replace l_prime = 2 if state!=state_prime & state_prime!=state_parent & (nu_state_prime == 1 & ccc_prime == 1)
replace l_prime = 3 if state!=state_prime & state_prime!=state_parent & (nu_state_prime == 2 & ccc_prime == 1)
replace l_prime = 4 if state!=state_prime & state_prime!=state_parent & (nu_state_prime == 1 & ccc_prime == 2)
replace l_prime = 5 if state!=state_prime & state_prime!=state_parent & (nu_state_prime == 2 & ccc_prime == 2)
*/




sort uniqid year

//estimation sample
//keep uniqid year mu e m p l age x ac f lp_type h wage l_prime
//conversion to indices

tab e_sp

* shift the 0-based codes up by one so they can be used as 1-based indices
foreach var of varlist e m p f e_sp {
	replace `var' = `var' + 1
}

tab e_sp
replace e_sp = 0 if e_sp == . //no spousal education available
tab e_sp


replace ac = ac + 2
* experience: shifted so 0 years is index 1, top-coded at 22; min() ignores missing
* values, so a missing x also becomes 22 (as with the comparison x>22)
replace x = min(x + 1, 22)
replace age = age-21 //age 22 becomes index 1
//drop if age == 15 //don't want to do this just yet.
replace wage = 0 if wage == . //0 marks an unobserved wage


//trim tails of allowed wages
replace wage = 7.25 if wage<7.25 & wage!=0 //floor positive wages at 7.25
su wage, d
replace wage = `r(p95)' if wage>`r(p95)' //cap at the 95th percentile
su wage, d


//only keep if observed at age 22 (index 1)
bys uniqid: egen first_age = min(age)
keep if first_age == 1
drop first_age

replace h = 0 if wage == 0 //assume a mistake here
replace wage = 0 if h == 0 //IMPORTANT: recode wage to unobserved if not coded as working

//fix up age variable. Sometimes it doesn't advance even when year does. Correct to keep things
//consistent between estimation and simulation sample and avoid duplicates
sort uniqid year
replace age = age[_n-1] + 1 if uniqid[_n-1] == uniqid & year[_n-1] == year - 1





//drop if age==16

//more fixing: again delete people with missing years
duplicates report uniqid age //(original note: checks out; actually changes a decent number of observations)
sort uniqid year
* gap between consecutive rows of the same person (0 on a person's first row)
gen yr_gap = cond(uniqid == uniqid[_n-1], year - year[_n-1], 0)
bys uniqid: egen yr_gap_max = max(yr_gap)
drop if yr_gap_max >=2 //limitation to yearly data!
drop yr_gap yr_gap_max




//even MORE fixing: recode l_prime to 99 if person is at oldest year of observation and state_prime not observed next period
bys uniqid: egen maxage = max(age)
replace l_prime = 99 if age == maxage & state_prime==.
//drop maxage
tab maxage
drop if maxage<4 //drop people whose highest age index is below 4

//merge m:1 uniqid using "$temp/psid_weights", keep(match) nogen
gen weight = 1 //unit weights (the weights merge above is disabled)

//obsolete, but keeping this just to avoid breaking more code
bys uniqid: egen everwork = max(h)

//recode people to white or black if ever coded as such
* anyone ever coded race 1 becomes race 1 in all rows
bys uniqid: egen white_ever = max(race == 1)
replace race = 1 if white_ever & race!=1
drop white_ever

* then anyone ever coded race 2 has their race-3 rows recoded to 2
bys uniqid: egen black_ever = max(race == 2)
replace race = 2 if black_ever & race==3
drop black_ever

//the people with race = 3 seem super low-income (original author's observation). Drop them.
* race_3_ever is left in the dataset (it is 0 for every remaining row)
bys uniqid: egen race_3_ever = max(race == 3)
drop if race_3_ever

//order and sort
order uniqid year mu e m p l age x ac f lp_type h wage l_prime maxage weight everwork e_sp
sort uniqid year

* spousal education by marriage/husband type
forval k = 1/3 {
	tab e_sp if m==`k'
}

replace e_sp = 0 if m==1 //m == 1 is the shifted "no husband" code

* Variables written to every model input file, and the rule selecting a person's first
* model period (age index 1 for e == 1, age index 4 for e == 2).
local modelvars "uniqid year mu e m p l age x ac f lp_type h wage l_prime maxage weight everwork e_sp"
local firstyear "(age == 1 & e == 1) | (age == 4 & e == 2)"

//main
preserve
keep `modelvars'
export delimited "$dir/Model/utilities/estimation_sample.csv", novarn replace
restore

//whites (race == 1), then blacks (race == 2): estimation csv + dta, then simulation csv
local racecode = 0
foreach grp in whites blacks {
	local ++racecode
	preserve
	keep if race == `racecode'
	keep `modelvars'
	export delimited "$dir/Model/utilities/estimation_sample_`grp'.csv", novarn replace
	save "$temp/psid_estimation_sample_`grp'", replace

	//bys uniqid: egen maxage = max(age)
	keep if `firstyear' //first year
	export delimited "$dir/Model/utilities/simulation_sample_`grp'.csv", novarn replace
	restore
}


* labelled Stata copy of the full estimation sample (also keeps race)
preserve
keep `modelvars' race
//labeling
lab var mu "Individual Fixed Effect Type"
lab var e "College dummy"
lab var m "Marriage"
lab var p "Past participation"
lab var l "Location"
lab var age "Age"
lab var x "Experience"
lab var ac "Age of youngest child"
lab var f "Pregnancy status"
lab var lp_type "Parent state characteristics"
lab var h "Labor force participation decision"
lab var wage "Hourly wage"
lab var l_prime "Next-period location"
lab var race "Race"
lab var e_sp "Spousal Education"
save "$temp/psid_estimation_sample", replace
restore


//simulation sample
preserve
keep `modelvars'
//bys uniqid: egen maxage = max(age)
keep if `firstyear' //first year
export delimited "$dir/Model/utilities/simulation_sample.csv", novarn replace
restore

